Data Loading

Load required Libraries

# Load required libraries.
# NOTE(review): the script previously began with rm(list=ls()); that is an
# anti-pattern in scripts (it silently wipes the user's workspace without
# giving a truly clean session) and has been removed -- restart R instead.
library(ggplot2)
library(dplyr)
library(tidyr)
library(RMySQL)
library(stringr)
library(magrittr)
library(pcaPP)          # for cor.fk (fast Kendall's tau)
library(directlabels)   # for geom_dl direct curve labels
library(proto)

Load in Wordbank data

## OPEN DATABASE CONNECTION ##
# NOTE(review): credentials and host IP are hard-coded in plain text;
# consider moving them to environment variables or a config file.
wordbank <- src_mysql(dbname='wordbank',host="54.149.39.46",
                      user="wordbank",password="wordbank")

## NOW LOAD TABLES ##
# These are lazy dplyr references to remote MySQL tables; rows are only
# pulled when the tbls are materialized (e.g. via as.data.frame) below.
source.table <- tbl(wordbank,"common_source")
admin.table <- tbl(wordbank,"common_administration")
child.table <- tbl(wordbank,"common_child")
wordmapping.table <- tbl(wordbank,"common_wordmapping")
instruments.table <- tbl(wordbank,"common_instrumentsmap")
english.ws.table <- tbl(wordbank,"instruments_english_ws")
spanish.ws.table <- tbl(wordbank,"instruments_spanish_ws")
norwegian.ws.table <- tbl(wordbank,"instruments_norwegian_ws")
danish.ws.table <- tbl(wordbank,"instruments_danish_ws")

Get kid data and put together.

# Get administration info (one row per completed CDI administration)
admins <- admin.table %>%
  select(data_id,child_id,age,source_id) %>%
  rename(id = data_id, child.id = child_id, source.id = source_id) 
admins <- as.data.frame(admins) # materialize the remote query locally

# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
  rename(child.id = id) # Rename id fields
demos <- as.data.frame(demos) # materialize the remote query locally

# Join age and demographics together (implicit join on child.id)
child.data <- as.tbl(left_join(admins,demos))

Set up mappings and instruments.

# Materialize the item mapping and instrument tables, then join them so
# each item row carries its instrument metadata (implicit join on
# instrument_id after the rename).
mapping <- as.data.frame(wordmapping.table)
instruments <- as.data.frame(instruments.table) %>%
  rename(instrument_id = id)
items <- left_join(mapping, instruments)

Function for getting all of the data in Wordbank for a given language (kid x item).

# Get all Wordbank data for one language as kid x item longform.
#
# Args:
#   lang.table: remote tbl for that language's Words & Sentences instrument
#   lang.items: item metadata table (mapping x instruments join)
#   lang: language name as it appears in lang.items$language
#   child.data: per-administration demographics, joined in by id
#
# Returns: a data frame with one row per kid x item, carrying the item's
# type/category/lexical_category and the child's age and demographics.
get.language.data <- function(lang.table, lang.items, lang, child.data) {
  
  instrument.items <- lang.items %>% 
    filter(language == lang, form == 'WS') %>%
    select(item, type, category, lexical_category) %>%
    mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
  
  instrument.data <- as.data.frame(lang.table) %>%
    rename(id = basetable_ptr_id) %>% # Rename the id
    gather(item, value, -id) %>% # Arrange in longform
    mutate(item = str_replace(item, "item_", "")) # Strip off item_ 
  
  # Attach item metadata, then child data. Return the result visibly --
  # the original ended on an assignment, which returns invisibly.
  d <- left_join(instrument.data, instrument.items)
  left_join(d, child.data)
}

Get kid x item data for all languages.

d.english <- get.language.data(lang.table=english.ws.table, 
                               lang.items=items, 
                               lang="English",
                               child.data)

d.spanish <- get.language.data(lang.table=spanish.ws.table, 
                               lang.items=items, 
                               lang="Spanish",
                               child.data)

d.norwegian <- get.language.data(lang.table=norwegian.ws.table, 
                                 lang.items=items, 
                                 lang="Norwegian",
                                 child.data)

# Norwegian data loads in funny -- word/word_form items use NA where the
# other languages use "" for "not produced". Recode to "" so downstream
# sums treat all languages uniformly. (Use <- for assignment.)
d.norwegian$value[d.norwegian$type %in% c("word_form","word") &
                    is.na(d.norwegian$value)] <- ""

d.danish <- get.language.data(lang.table=danish.ws.table, 
                              lang.items=items, 
                              lang="Danish",
                              child.data)

# Danish has the same NA-for-"" quirk on word/word_form items; recode.
d.danish$value[d.danish$type %in% c("word_form","word") &
                 is.na(d.danish$value)] <- ""

Function for getting vocab size data.

# Per-kid vocabulary size for one language's longform data.
# Returns one row per (age, id) with:
#   vocab.sum  -- number of word items marked "produces"
#   vocab.mean -- that count as a proportion of all word items
language.vocab.sizes <- function(lang.data) {
  lang.data %>%
    filter(type == "word") %>%
    group_by(age, id) %>%
    summarise(vocab.sum = sum(value == "produces", na.rm = TRUE),
              vocab.mean = vocab.sum / length(value))
}

Syntax and Morphology Analyses

Function for getting kid x {vocab size, syntax score, morphology score} data.

According to Virginia, NAs here are just “my kid doesn’t say that,” and should be scored as 1s. So now this is computed as the total number of “complex” divided by the length.

# Helper: per-kid proportion of items of `item.type` whose value equals
# `target`. NAs count in the denominator (scored as "not target"), per
# Virginia's note above. Kids who left every item NA get NA instead of 0;
# the temp columns work around ifelse() coercing the result to logical.
score.language.items <- function(lang.data, item.type, target, score.name) {
  scores <- lang.data %>%
    filter(type == item.type) %>%
    group_by(id) %>%
    summarise(all.na = all(is.na(value)),
              score.sum = sum(value == target, 
                              na.rm=TRUE) / length(value)) %>%
    mutate(score = ifelse(all.na, NA, score.sum)) %>%
    select(-all.na, -score.sum)
  names(scores)[names(scores) == "score"] <- score.name
  return(scores)
}

# Get kid x {vocab size, complexity (syntax), word form (morphology)} data
# for one language. The two measures share the same scoring logic, so both
# go through score.language.items above.
summarise.language.data <- function(lang.data, lang) {
  
  d.vocab <- language.vocab.sizes(lang.data)
  
  d.complexity <- score.language.items(lang.data, "complexity",
                                       "complex", "complexity")
  d.wordform <- score.language.items(lang.data, "word_form",
                                     "produces", "wordform")
  
  # Spanish doesn't have "ending" data, so that measure is skipped,
  # at least for now.
  
  d.composite <- left_join(d.vocab, d.complexity)   # joins on id
  d.composite <- left_join(d.composite, d.wordform) # joins on id
  
  d.composite$language <- lang
  
  return(d.composite)
}

Get kid x {vocab size, syntax score, morphology score} data for all languages and aggregate them.

summary.english <- summarise.language.data(d.english,"English")
summary.spanish <- summarise.language.data(d.spanish,"Spanish")
summary.norwegian <- summarise.language.data(d.norwegian,"Norwegian")
summary.danish <- summarise.language.data(d.danish,"Danish")

# Stack all languages, restrict to 16-32 months, and bin ages.
# NOTE: dplyr's rbind_list() is deprecated (and removed in modern dplyr);
# bind_rows() is the supported equivalent.
summary.data <- bind_rows(summary.english,summary.spanish,
                          summary.norwegian,summary.danish) %>%
  filter(age > 15 & age < 33) %>%
  mutate(age.group = cut(age, breaks = c(15, 20, 24, 28, 32)),
         language = factor(language,
                           levels=c("English", "Spanish", 
                                    "Norwegian", "Danish")))
# Gather to long form for plotting: one row per kid x measure
ms <- summary.data %>% gather(measure, score, complexity:wordform) %>%
  mutate(measure = factor(measure, levels = c("wordform","complexity"),
                          labels = c("Word Form", "Complexity")))


# Sample sizes (kid x measure rows) per language and age bin; the ##
# lines below are the pasted output from a previous run.
ms %>% 
  group_by(language, age.group) %>% 
  summarise(n = n())
## Source: local data frame [16 x 3]
## Groups: language
## 
##     language age.group    n
## 1    English   (15,20] 4596
## 2    English   (20,24] 2468
## 3    English   (24,28] 2944
## 4    English   (28,32] 1264
## 5    Spanish   (15,20]  706
## 6    Spanish   (20,24]  608
## 7    Spanish   (24,28]  594
## 8    Spanish   (28,32]  280
## 9  Norwegian   (15,20] 3394
## 10 Norwegian   (20,24] 6438
## 11 Norwegian   (24,28] 6196
## 12 Norwegian   (28,32] 5418
## 13    Danish   (15,20] 1842
## 14    Danish   (20,24] 1754
## 15    Danish   (24,28] 1420
## 16    Danish   (28,32] 1346

Using Age and Vocab to predict Morphology and Syntax Scores.

# quartz() opens a graphics device on macOS only; use x11()/windows()
# on other platforms.
quartz(width=8,height=7.5)
# Score vs. vocabulary size by age group, faceted language x measure.
# The smoother fits y ~ x^2; the "- 1" drops the intercept, forcing the
# curve through the origin (zero vocab -> zero score).
ggplot(ms, aes(x = vocab.mean, y = score, colour = age.group, fill = age.group,
              label = age.group)) + 
  #geom_point(alpha=.5, size=.8) + 
  geom_jitter(alpha=.5,size=.8) +
  geom_smooth(method="lm", formula = y ~ I(x^2) - 1) + 
  facet_grid(language~measure) + 
  scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
                     name = "Vocabulary Size") + 
  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
                     "Score (Mean Items)") + 
  theme_bw(base_size = 14) +
  scale_color_brewer(palette="Set1") +
  scale_fill_brewer(palette="Set1") 

Using Morphology scores to Predict Syntax scores.

# quartz() is macOS-only; substitute x11()/windows() elsewhere.
quartz(width=8,height=7.5)
# Syntax (complexity) vs. morphology (wordform) score, one facet per
# language. The smoother fits y ~ exp(x) with "- 1" dropping the
# intercept term of the linear model.
ggplot(summary.data,aes(x = wordform, y = complexity, fill=age.group,colour=age.group,
                        label=age.group)) + 
  facet_wrap( ~ language) +
  geom_jitter(size=1)+
  geom_smooth(method="lm", formula = y ~ exp(x) - 1) + 
  scale_x_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),
                     name = "Morphology Score") + 
  scale_y_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),"Syntax Score") + 
  scale_color_brewer(palette="Set1") +
  scale_fill_brewer(palette="Set1") +
  theme_bw(base_size = 14)

Vocabulary Composition Analysis

Function for computing vocabulary composition for each speaker of a language.

# Vocabulary composition for each speaker of one language: the proportion
# of each kid's produced vocabulary falling in each lexical category.
#
# Args:
#   lang.data: kid x item longform data from get.language.data
#   lang: language label to attach to the result
#
# Returns: one row per (age, id, lexical_category) with prop = words
# produced in that category / total words produced.
vocab.composition <- function(lang.data,lang) {  
  
  d.vocab <- language.vocab.sizes(lang.data)
  
  # Words produced per kid within each lexical category
  d.cat <- lang.data %>%
    filter(type == "word") %>%
    group_by(id,lexical_category) %>%
    summarise(cat = sum(value == "produces", na.rm=TRUE))
  
  d.vocab.comp <- left_join(d.vocab, d.cat) %>% # joins on id
    mutate(prop = cat / vocab.sum) %>%
    select(-cat) 
  d.vocab.comp$language <- lang # use <- for assignment, not =
  
  return(d.vocab.comp)
}

Function for computing CDI form composition for all languages.

# CDI form composition per language: what proportion of each WS form's
# word items belongs to each lexical category. These serve as baselines
# against which kids' actual vocabulary composition is compared.
lang.vocab.composition <- function(lang.items) {
  # Restrict to word items on the Words & Sentences form
  ws.words <- lang.items %>%
    filter(form == "WS", type == "word")

  # Total number of word items on each language's form
  form.totals <- ws.words %>%
    group_by(language) %>%
    summarise(n = n())

  # Items per lexical category, as a proportion of the form total
  ws.words %>%
    group_by(language, lexical_category) %>%
    summarise(num.per.cat = n()) %>%
    left_join(form.totals) %>%
    mutate(prop.per.cat = num.per.cat / n)
}

Get vocabulary composition data for all languages.

# get form compositions
lang.vocab.comp <- lang.vocab.composition(items) %>%
  filter(lexical_category != "other")

# get data for kids in each language
vocab.comp.english <- vocab.composition(d.english,"English")
vocab.comp.spanish <- vocab.composition(d.spanish,"Spanish")
vocab.comp.norwegian <- vocab.composition(d.norwegian,"Norwegian")
vocab.comp.danish <- vocab.composition(d.danish,"Danish")

# aggregate data for all languages together
summary.vocab.comp <- rbind_list(vocab.comp.english,vocab.comp.spanish,
                                 vocab.comp.norwegian,vocab.comp.danish) %>%
  filter(age > 15 & age < 33) %>%
  mutate(age.group = cut(age, breaks = c(15, 20, 24, 28, 32)),
         language = factor(language,
                           levels=c("English", "Spanish", 
                                    "Norwegian", "Danish")),
         lexical_category = factor(lexical_category, 
                                   levels = c("nouns", "predicates", 
                                              "function_words", "other"),
                                   labels = c("Nouns", "Predicates", 
                                              "Function Words", "Other")))

Plot vocabulary composition by language.

# Vocabulary composition vs. vocabulary size, one facet per language.
# Dashed grey lines are each form's category proportions (the baseline a
# kid would show if they learned words uniformly at random off the form).
# Curves are labeled directly via geom_dl, so the legend is suppressed.
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour=lexical_category, 
           shape = lexical_category, fill = lexical_category,
           label=lexical_category)) +
  geom_point(size = 1, alpha = 0.25) +
  facet_wrap(~ language) +
  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")+
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="none")

Plot vocabulary composition by language and age group

# Same composition plot as above, but additionally faceted by age bin
# (language rows x age.group columns); points jittered since the age
# bins produce heavy overplotting.
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour=lexical_category, 
           shape = lexical_category, fill = lexical_category,
           label=lexical_category)) +
  geom_jitter(size = 1, alpha = 0.5) +
  facet_grid(language ~ age.group) +
  geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")+
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="none")

Plot vocabulary composition, now by lexical category.

# Relevel/relabel the baseline table's categories so its facet labels
# match summary.vocab.comp's (which was relabeled when aggregated).
lang.vocab.comp$lexical_category <- factor(lang.vocab.comp$lexical_category, 
                                           levels=c("function_words","nouns",
                                                    "predicates","other"),
                                           labels=c("Function Words","Nouns",
                                                    "Predicates","Other"))
# quartz() is macOS-only; substitute x11()/windows() elsewhere.
quartz()
# Composition vs. vocabulary size, faceted language x lexical category,
# coloured by age bin.
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
       aes(x=vocab.mean, y=prop, colour = age.group, 
           fill = age.group)) +
  geom_jitter(size = 1, alpha = 0.5) +
  facet_grid(language ~ lexical_category) +
  geom_hline(data=lang.vocab.comp, 
             aes(yintercept=prop.per.cat),
             linetype="dashed", color="grey") + #baselines for each language
  geom_smooth(aes(group=age.group), method='loess', span=0.5) +
  scale_y_continuous(name = "Proportion of total vocabulary") +
  scale_x_continuous(name = "Vocabulary Size") +
  theme_bw(base_size=14) + 
  scale_color_brewer(palette = "Set1") +
  scale_fill_brewer(palette = "Set1")+
  theme(axis.text.x = element_text(angle=-40, hjust = 0),
        axis.title.y = element_text(vjust=0.35),
        axis.title.x = element_text(vjust=-0.5),
        legend.position="none")


(Old stuff that’s being kept around for possible future use.)

# Fit regressions to data
# t.lm1 <- lm(score ~ age + measure, data=filter(ms,language=="English"))
# t.lm2 <- lm(score ~ I(vocab.sum^2)*measure + age*measure, data=filter(ms,language=="English"))
# t.lm3 <- lm(score  ~ I(vocab.sum^2)*measure*age.binned, data=filter(ms,language=="English"))
# t.lm4 <- lm(score  ~ I(vocab.sum^2)*measure*age, data=filter(ms,language=="English"))
# 
# ms$predicted <- predict.lm(t.lm3,ms)
# 
# Plot by age
# ggplot(ms,aes(x = vocab, y = score, colour = measure,label=measure))+
#  facet_wrap(~ age)+
#  geom_jitter(size=1)+
#  geom_line(aes(y=predicted),size=.5)+
#  scale_color_brewer(palette="Set1") +
#  scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") + 
#  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") + 
#  theme_bw(base_size = 14) 

Replot original correlation with fitted model

```{r,fig.width=12,fig.height=7.5}
#ggplot(ms,aes(x = vocab, y = score, colour = age.binned, fill = age.binned,
#              label = age.binned)) +
#  geom_jitter(size=1.5)+
#  geom_line(aes(y=predicted),size=1) +
#  facet_wrap(~measure) +
#  scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") +
#  scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") +
#  theme_bw(base_size = 14) +
#  scale_color_brewer(palette="Set1") +
#  scale_fill_brewer(palette="Set1")
```

Compute some descriptives on syntactic items

```{r,fig.width=7,fig.height=4}
# Compute Kendall's tau -- cor.fk (pcaPP) is a faster implementation than
# the one in stats::cor. Columns 8 onward are assumed to be the per-item
# complexity responses -- TODO confirm against how bykid.syntax.vocab is
# built (it is not defined in this file).
complex.cors <- cor.fk(as.matrix(bykid.syntax.vocab[,8:ncol(bykid.syntax.vocab)])) %>%
  as.data.frame

# Strip the "complx" prefix so row/column names are bare item numbers.
names(complex.cors) <- str_replace(names(complex.cors),"complx","")
row.names(complex.cors) <- str_replace(row.names(complex.cors),"complx","")

# Make a dendrogram of the complex item similarities
complex.dendro <- as.dendrogram(hclust(dist(complex.cors)))
plot(complex.dendro)
```

Make a confusion matrix

```{r,fig.width=8,fig.height=5}
# Gather the item columns ("01" through "37") for plotting as a
# confusion-matrix-style heatmap of item-item correlations.
complex.cors %<>%
  mutate(prompt = factor(row.names(complex.cors))) %>%
  gather(response,correlation,"01":"37")

# NOTE(review): theme_bw() is a complete theme and will override the
# theme(legend.position=...) call placed before it -- consider swapping
# their order if the legend reappears.
ggplot(complex.cors, aes(response, prompt)) +
  geom_tile(aes(fill = correlation)) +
  ylim(rev(levels(complex.cors$prompt))) + # prompts top-to-bottom
  scale_fill_gradient(low = "white", high = "black",guide=FALSE) +
  labs(x="Response", y = "Prompt") +
  theme(legend.position = "none", axis.ticks = element_blank()) +
  theme_bw(base_size = 16)
```

Compute vocab x age interaction terms for each syntactic item

```{r,fig.height=5,fig.width=8}
# Write a regression formula separately for each item: the item response
# predicted by squared vocab, age, and their interaction, no intercept.
formulas <- sapply(names(bykid.syntax.vocab)[8:ncol(bykid.syntax.vocab)],
                   function(x) paste(x, "~ I(vocab^2)*age + 0", collapse=""))

# Fit each logistic regression and pull the interaction term's z value:
# coefficient row 3 is I(vocab^2):age, column 3 is the z statistic.
interaction.terms <- sapply(formulas, function(x)
  summary(glm(as.formula(x), data=bykid.syntax.vocab,
              family="binomial"))$coefficients[3,3])
names(interaction.terms) <- 1:37

# Rename results to be human-readable and order items by z-score
interaction.terms <- as.data.frame(interaction.terms) %>%
  mutate(item = 1:37) %>%
  rename(zscore = interaction.terms) %>%
  arrange(zscore) %>%
  mutate(item = factor(item,levels=item))

# Plot interaction terms by item, with a dashed line at the mean
ggplot(interaction.terms,
       aes(x=item,y=zscore,fill=1))+
  geom_bar(stat="identity")+
  geom_hline(yintercept=mean(interaction.terms$zscore),
             lty=2)+
  theme_bw(base_size = 14) +
  scale_y_continuous(name="vocabulary size x age z-score",limits=c(0,15),
                     breaks=seq(0,15,2.5))+
  scale_x_discrete(name="complexity CDI item")+
  scale_color_brewer(palette="Set1") +
  theme(legend.position="none")
```

Leftover analyses

```{r}
# Leftover exploratory regressions ("- 1" drops the intercept).
summary(lm(syntax ~ I(vocab^2) * age - 1, data=d))
summary(lm(morpho ~ I(vocab^2) * age - 1, data=d))
# NOTE(review): the next two lines originally read "I(d\(s.vocab^2) *
# d\)age_bin" -- the backslash-parens are "$"s mangled by math rendering;
# reconstructed as d$s.vocab / d$age_bin.
summary(lm(syntax ~ I(d$s.vocab^2) * d$age_bin - 1, data=d))
summary(lm(morpho ~ I(d$s.vocab^2) * d$age_bin - 1, data=d))